Integrating zebrafish and adolescent mouse scRNAseq data (Linnarsson).
#install version 1.0.9 of dyplyr
#dplyrurl <- "https://cran.r-project.org/src/contrib/Archive/dplyr/dplyr_1.0.9.tar.gz"
#install.packages(dplyrurl, repos=NULL, type="source")
#install version 4.1.1 of seurat
#remotes::install_version("Seurat", version = "4.1.1")
#install loomR
#devtools::install_github("mojaveazure/loomR")
#install seuratdisk
#remotes::install_github(repo ='mojaveazure/seurat-disk')#load packages
library(dplyr) #tidyverse dataframe manipulation
library(Seurat) #scRNAseq analysis
library(patchwork) #organizes plots
library(loomR) #for reading loom files
library(SeuratDisk) #for converting from loom to seurat
library(Matrix) #convert matrix to sparse matrix
library(ggplot2) #graphing
library(sctransform) #sctransform normalization
library(harmony) #integration of data
library(scales) #colors
#set seed
set.seed(1234)
#in command line, put files back together
#cat l5_all.tar.00* >l5_all.tar
#in command line, untar
#tar -xvf l5_all.tar
#connect to loom file with all mouse data (read only)
#mousescrnaseq <- Connect(filename = "l5_loom/l5_all.loom", mode = "r")
#convert to Seurat file
#mousescrnaseqSO <- as.Seurat(mousescrnaseq)
#save Seurat object
#save(mousescrnaseqSO, file = "mousescRNAseq_all.RObj")
#close connection
#mousescrnaseq$close_all()
#load mouse object
load(file = "mousescRNAseq_all.RObj")#load zebrafish object
load(file = "forebrain_all_final.RObj")
#subset to only include wanted clusters in zebrafish
zebrafishclusterstokeep <- c("Pallium_01", "Pallium_02", "Pallium_03", "Pallium_04", "Pallium_05", "Pallium_06", "Pallium_07", "Pallium_08", "Pallium_09", "Subpallium_01", "Subpallium_02", "Subpallium_03", "Subpallium_04", "Subpallium_05", "Subpallium_06", "Subpallium_07", "Subpallium_08")
#subset zebrafish object to keep wanted clusters
forebrain.integrated <- subset(forebrain.integrated, subset= clusterInterp2 %in% zebrafishclusterstokeep)
#convert zebrafish genes to mouse genes
#forebrain.integrated@assays$RNA@counts <- orthogene::convert_orthologs(gene_df=forebrain.integrated@assays$RNA@counts,
#gene_input="rownames",
#gene_output="rownames",
#input_species="zebrafish",
#output_species="mouse",
#non121_strategy="kp",
#method="gprofiler")
#make matrix out of counts
#matrix <- forebrain.integrated@assays$RNA@counts
#aggregate rows with same mouse gene name
#matrix2 <- aggregate(matrix, list(row.names(matrix)), mean)
#rownames(matrix2) <- matrix2$Group.1
#matrix2 = subset(matrix2, select = -c(Group.1) )
#convert to sparse matrix
#matrix3 <- data.matrix(matrix2, rownames.force = TRUE)
#matrix3 <- as(matrix3, "dgCMatrix")
#put sparse matrix into seurat object
#forebrain.integrated@assays$RNA@counts <- matrix3
#make a copy of seurat object
#forebrain.integrated.mouse <- forebrain.integrated
#save Seurat object
#save(forebrain.integrated.mouse, file = "forebrain.integrated.mousenames.RObj")
#load zebrafish object with mouse names
load(file = "forebrain.integrated.mousenames.RObj")#subset mouse object to only include relevant clusters
mouseclusterstokeep <- c("TEGLU1", "TEGLU3", "TEGLU2", "TEGLU20", "TEGLU11", "TEGLU12", "TEGLU10", "TEGLU9", "TEGLU8", "TEGLU7", "TEGLU6", "TEGLU13", "TEGLU14", "TEGLU5", "TEGLU16", "TEGLU15", "TEGLU17", "TEGLU18", "TEGLU19", "TEGLU22", "TEGLU21", "TEGLU4", "TEGLU24", "TEGLU23", "CR", "DECHO1", "MSN1", "MSN2", "MSN3", "MSN4", "MSN5", "MSN6", "TEINH17", "TEINH18", "TEINH19", "TEINH21", "TEINH16", "TEINH15", "TEINH14", "TEINH20", "TEINH13", "TEINH12", "TEINH9", "TEINH10", "TEINH11", "TEINH4", "TEINH5", "TEINH8", "TEINH7", "TEINH6", "TECHO", "TEINH3", "TEINH2", "TEINH1")
#subset mouse object to keep wanted clusters
mousescrnaseqSO <- subset(mousescrnaseqSO, subset= ClusterName %in% mouseclusterstokeep)
#add metadata to mousescrnaseqSO
mousescrnaseqSO$MouseorFish <- c("mouse")
mousescrnaseqSO$ClusterInterp3 <- mousescrnaseqSO$ClusterName
#add metadata to forebrain.integrated
forebrain.integrated.mouse$MouseorFish <- c("zebrafish")
forebrain.integrated.mouse$ClusterInterp3 <- forebrain.integrated.mouse$clusterInterp2
#find genes that are in both mouse and zebrafish objects
sharedgenes <- intersect(forebrain.integrated.mouse@assays$RNA@counts@Dimnames[[1]], mousescrnaseqSO@assays$RNA@counts@Dimnames[[1]])
#remove pvalb
sharedgenes <- sharedgenes[! sharedgenes %in% c("Pvalb")]
#subset zebrafish to only include shared genes
forebrain.integrated.mouse@assays$RNA@counts <- forebrain.integrated.mouse@assays$RNA@counts[sharedgenes, ]
#remove integrated assay from zebrafish object
forebrain.integrated.mouse[['integrated']] <- NULL
#subset mouse to only include shared genes
mousescrnaseqSO@assays$RNA@counts <- mousescrnaseqSO@assays$RNA@counts[sharedgenes, ]
#merge objects
zebrafishmouseobject <- merge(forebrain.integrated.mouse, y = c(mousescrnaseqSO), add.cell.ids = c("zebrafish", "mouse"))
#save object
save(zebrafishmouseobject, file = "zebrafishmousemergedobject.RObj")#change active identity
zebrafishmouseobject <- SetIdent(zebrafishmouseobject, value = zebrafishmouseobject@meta.data$MouseorFish)
#stores mitochondrial counts in $percent.mt and then subsets to remove cells with less than 200 genes and those in which >6% of transcript counts were derived from mitochondrial-encoded genes
zebrafishmouseobject[["percent.mt"]] <- PercentageFeatureSet(zebrafishmouseobject, pattern = "^mt-") #adds column with percent.mt
#QC metrics before filtering
VlnPlot(zebrafishmouseobject, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3, raster=FALSE)#subsets based on features/cell and percent mt
zebrafishmouseobject <- subset(zebrafishmouseobject, subset = nFeature_RNA > 200 & percent.mt < 6)
#QC metrics after filtering
VlnPlot(zebrafishmouseobject, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3, raster=FALSE)zebrafishmouseobject <- SCTransform(zebrafishmouseobject, vars.to.regress = "percent.mt", verbose = FALSE)zebrafishmouseobject <- RunPCA(zebrafishmouseobject)
#how many PCs should we use?
ElbowPlot(zebrafishmouseobject)zebrafishmouseobject <- RunUMAP(zebrafishmouseobject, dims = 1:30, n.neighbors=5L)
zebrafishmouseobject <- FindNeighbors(zebrafishmouseobject, dims = 1:30)
zebrafishmouseobject <- FindClusters(zebrafishmouseobject, resolution = 1.7)## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 75324
## Number of edges: 2514556
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8969
## Number of communities: 61
## Elapsed time: 23 seconds
#plot by species
DimPlot(zebrafishmouseobject, reduction = 'umap', group.by = "MouseorFish", raster=FALSE) + ggtitle("")#load object
load("zebrafishmousemergedobject.RObj")
#split into list by species
forebrain.split <- SplitObject(zebrafishmouseobject, split.by = "MouseorFish")#stores mitochondrial counts in $percent.mt and then subsets to remove cells with less than 200 genes and those in which >6% of transcript counts were derived from mitochondrial-encoded genes
forebrain.split <- lapply(X = forebrain.split, FUN = function(x) {
x[["percent.mt"]] <- PercentageFeatureSet(x, pattern = "^mt-") #adds column with percent.mt
x <- subset(x, subset = nFeature_RNA > 200 & percent.mt < 6) #subsets based on features/cell and percent mt
})#sctransform normalization
#Note that this single command replaces NormalizeData(), ScaleData(), and FindVariableFeatures()
forebrain.split <- lapply(X = forebrain.split, FUN = SCTransform, vars.to.regress = "percent.mt")##
|
| | 0%
|
|================== | 25%
|
|=================================== | 50%
|
|==================================================== | 75%
|
|======================================================================| 100%
##
|
| | 0%
|
|=== | 5%
|
|====== | 9%
|
|========== | 14%
|
|============= | 18%
|
|================ | 23%
|
|=================== | 27%
|
|====================== | 32%
|
|========================= | 36%
|
|============================= | 41%
|
|================================ | 45%
|
|=================================== | 50%
|
|====================================== | 55%
|
|========================================= | 59%
|
|============================================= | 64%
|
|================================================ | 68%
|
|=================================================== | 73%
|
|====================================================== | 77%
|
|========================================================= | 82%
|
|============================================================ | 86%
|
|================================================================ | 91%
|
|=================================================================== | 95%
|
|======================================================================| 100%
##
|
| | 0%
|
|=== | 5%
|
|====== | 9%
|
|========== | 14%
|
|============= | 18%
|
|================ | 23%
|
|=================== | 27%
|
|====================== | 32%
|
|========================= | 36%
|
|============================= | 41%
|
|================================ | 45%
|
|=================================== | 50%
|
|====================================== | 55%
|
|========================================= | 59%
|
|============================================= | 64%
|
|================================================ | 68%
|
|=================================================== | 73%
|
|====================================================== | 77%
|
|========================================================= | 82%
|
|============================================================ | 86%
|
|================================================================ | 91%
|
|=================================================================== | 95%
|
|======================================================================| 100%
##
|
| | 0%
|
|================== | 25%
|
|=================================== | 50%
|
|==================================================== | 75%
|
|======================================================================| 100%
##
|
| | 0%
|
|=== | 4%
|
|====== | 9%
|
|========= | 13%
|
|============ | 17%
|
|=============== | 22%
|
|================== | 26%
|
|===================== | 30%
|
|======================== | 35%
|
|=========================== | 39%
|
|============================== | 43%
|
|================================= | 48%
|
|===================================== | 52%
|
|======================================== | 57%
|
|=========================================== | 61%
|
|============================================== | 65%
|
|================================================= | 70%
|
|==================================================== | 74%
|
|======================================================= | 78%
|
|========================================================== | 83%
|
|============================================================= | 87%
|
|================================================================ | 91%
|
|=================================================================== | 96%
|
|======================================================================| 100%
##
|
| | 0%
|
|=== | 4%
|
|====== | 9%
|
|========= | 13%
|
|============ | 17%
|
|=============== | 22%
|
|================== | 26%
|
|===================== | 30%
|
|======================== | 35%
|
|=========================== | 39%
|
|============================== | 43%
|
|================================= | 48%
|
|===================================== | 52%
|
|======================================== | 57%
|
|=========================================== | 61%
|
|============================================== | 65%
|
|================================================= | 70%
|
|==================================================== | 74%
|
|======================================================= | 78%
|
|========================================================== | 83%
|
|============================================================= | 87%
|
|================================================================ | 91%
|
|=================================================================== | 96%
|
|======================================================================| 100%
# select features that are repeatedly variable across datasets for integration
features <- SelectIntegrationFeatures(object.list = forebrain.split, nfeatures = 3000)
forebrain.split <- PrepSCTIntegration(object.list = forebrain.split, anchor.features = features)
#takes a list of Seurat objects as input, and use these anchors to integrate the two datasets together with IntegrateData().
forebrain.anchors <- FindIntegrationAnchors(object.list = forebrain.split, normalization.method = "SCT",
anchor.features = features)
forebrain.combined.sct <- IntegrateData(anchorset = forebrain.anchors, normalization.method = "SCT")
#switch default assay
DefaultAssay(forebrain.combined.sct) <- "integrated"
#standard analysis
forebrain.combined.sct <- RunPCA(forebrain.combined.sct, verbose = FALSE)
forebrain.combined.sct <- RunUMAP(forebrain.combined.sct, reduction = "pca", dims = 1:30)
#plot by species
DimPlot(forebrain.combined.sct, reduction = 'umap', group.by = "MouseorFish", raster=FALSE) + ggtitle("")#labeled by original clusters
DimPlot(forebrain.combined.sct, reduction = "umap", group.by = "clusterInterp2", raster=FALSE, label=T, repel=T) + ggtitle("Integrated by Cluster Name")DimPlot(forebrain.combined.sct, reduction = "umap", group.by = "ClusterName", raster=FALSE, label=T, repel=T) + ggtitle("Integrated by Cluster Name")forebrain.combined.sct <- forebrain.combined.sct %>%
RunHarmony("MouseorFish", assay.use="SCT")
# UMAP and clustering with harmonized PCs
forebrain.combined.sct <- RunUMAP(forebrain.combined.sct, reduction='harmony', dims = 1:30)
forebrain.combined.sct <- FindNeighbors(forebrain.combined.sct, reduction='harmony')
forebrain.combined.sct <- FindClusters(forebrain.combined.sct, resolution = 0.35)## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 68274
## Number of edges: 1991077
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9116
## Number of communities: 11
## Elapsed time: 28 seconds
load(file="forebrain.combined.sct.RObj")
#get list of colors
hex <- sample(hue_pal()(24))
#plot by species
DimPlot(forebrain.combined.sct, reduction = 'umap', group.by = "MouseorFish", raster=FALSE) + ggtitle("")colsorig <- c('cyan4', 'blue1', 'darkturquoise', 'yellowgreen', 'mediumspringgreen', 'lightskyblue', 'royalblue1', 'green1', 'magenta1', 'purple','lightcoral', 'thistle1', 'palevioletred1', 'darkviolet', 'maroon1', 'tomato1', 'slateblue1')
colsorig <- unlist(lapply(colsorig, adjustcolor, alpha= 1.0))
#labeled by original clusters
DimPlot(forebrain.combined.sct, cols = colsorig, reduction = "umap", group.by = "clusterInterp2", raster=FALSE, label=T, repel=T, na.value="grey70") + ggtitle("") + theme(legend.position = 'none')#graph by renamed mouse clusters
forebrain.combined.sct <- SetIdent(forebrain.combined.sct, value = forebrain.combined.sct$ClusterName)
forebrain.combined.sct <- RenameIdents(forebrain.combined.sct, c("TEGLU1" = "Isocortex", "TEGLU3" = "Isocortex", "TEGLU2" = "Isocortex", "TEGLU20" = "Isocortex", "TEGLU11" = "Isocortex", "TEGLU12" = "Isocortex", "TEGLU10" = "Isocortex", "TEGLU9" = "Isocortex", "TEGLU8" = "Isocortex", "TEGLU7" = "Isocortex", "TEGLU6" = "Isocortex", "TEGLU13" = "Subiculum", "TEGLU14" = "Subiculum", "TEGLU5" = "Entorhinal", "TEGLU16" = "Piriform pyramidal", "TEGLU15" = "Piriform pyramidal", "TEGLU17" = "Piriform pyramidal", "TEGLU18" = "Anterior olfactory nucleus", "TEGLU19" = "Anterior olfactory nucleus", "TEGLU22" = "Basolateral amygdala", "TEGLU21" = "CA1, subiculum, entorhinal", "TEGLU4" = "Isocortex", "TEGLU24" = "CA1", "TEGLU23" = "CA3", "CR" = "Cajal-Retzius cells", "DECHO1" = "Cholinergic", "MSN1" = "D1 medium spiny neurons", "MSN2" = "D2 medium spiny neurons", "MSN3" = "D2 medium spiny neurons", "MSN4" = "D1 medium spiny neurons", "MSN5" = "D1 medium spiny neurons", "MSN6" = "D1 medium spiny neurons", "TEINH17" = "Axo-axonic interneurons", "TEINH18" = "Basket and bistratified cells", "TEINH19" = "Hippocamposeptal projection interneurons", "TEINH21" = "Long-range projection interneurons", "TEINH16" = "MGE-derived neurogliaform", "TEINH15" = "CGE-derived neurogliaform", "TEINH14" = "CGE-derived neurogliaform", "TEINH20" = "Inhibitory interneurons", "TEINH13" = "Trilaminar cells", "TEINH12" = "Cck interneurons", "TEINH9" = "Cck interneurons", "TEINH10" = "Cck interneurons", "TEINH11" = "Cck interneurons", "TEINH4" = "Interneuron-selective interneurons", "TEINH5" = "Interneuron-selective interneurons", "TEINH8" = "Interneuron-selective interneurons", "TEINH7" = "Interneuron-selective interneurons", "TEINH6" = "Interneuron-selective interneurons", "TECHO" = "Cholinergic", "TEINH3" = "Inhibitory interneurons", "TEINH2" = "Inhibitory interneurons", "TEINH1" = "Inhibitory neurons, pallidum"))
set.seed(9)
hex = sample(c('mistyrose2', 'chartreuse1' , 'chocolate3', 'deepskyblue1',
'rosybrown3','gold1', 'orange1', 'sandybrown', 'coral', 'navajowhite1',
'lightsteelblue2', 'darkseagreen1',
'lightcyan1', 'darkslategray1', 'pink1', 'thistle1',
'cyan4', 'darkturquoise', 'yellowgreen', 'mediumspringgreen', 'royalblue1', 'green1', 'magenta1',
'purple','lightcoral', 'orchid1', 'palevioletred1', 'darkviolet', 'maroon1',
'tomato1', 'slateblue1','red','burlywood2','olivedrab1'))
DimPlot(forebrain.combined.sct, cols = hex, reduction = "umap", raster=FALSE, label=T, repel=T, na.value="grey70") + ggtitle("")#make csv with integrated markers
#integratedmarkers <- FindAllMarkers(forebrain.combined.sct, only.pos = T, logfc.threshold = 0.65)
#write.csv(as.data.frame(integratedmarkers), "integratedmarkerswopvalbresolution2.csv")
library(patchwork)
FeaturePlot(forebrain.combined.sct, features = c('Sst', "Tac1", "Nxph1", "Penk", "Lhx8", "Rprm", "Cbln1", "Nell2", "Jun", "Nptx1", "Ucn", "Egr2", "Efna1", "Emx2", "Zic2"), min.cutoff = 0, ncol = 3) Packages and versions necessary to reproduce the results in this report.
sessionInfo()## R version 4.2.0 (2022-04-22)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Red Hat Enterprise Linux
##
## Matrix products: default
## BLAS/LAPACK: /data/rc/apps/rc/software/FlexiBLAS/3.0.4-GCC-10.3.0/lib64/libflexiblas.so.3.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] scales_1.2.1 harmony_0.1.1 Rcpp_1.0.10
## [4] sctransform_0.3.5 ggplot2_3.4.0 Matrix_1.5-3
## [7] SeuratDisk_0.0.0.9020 loomR_0.2.0 itertools_0.1-3
## [10] iterators_1.0.14 hdf5r_1.3.7 R6_2.5.1
## [13] patchwork_1.1.2 SeuratObject_4.1.3 Seurat_4.3.0
## [16] dplyr_1.0.10 knitr_1.41 tinytex_0.43
## [19] rmarkdown_2.20
##
## loaded via a namespace (and not attached):
## [1] ggbeeswarm_0.7.1 Rtsne_0.16
## [3] colorspace_2.1-0 deldir_1.0-6
## [5] ellipsis_0.3.2 ggridges_0.5.4
## [7] rstudioapi_0.14 spatstat.data_3.0-0
## [9] farver_2.1.1 leiden_0.4.3
## [11] listenv_0.9.0 bit64_4.0.5
## [13] ggrepel_0.9.3 fansi_1.0.4
## [15] codetools_0.2-18 splines_4.2.0
## [17] cachem_1.0.6 polyclip_1.10-4
## [19] jsonlite_1.8.4 ica_1.0-3
## [21] cluster_2.1.4 png_0.1-8
## [23] uwot_0.1.14 shiny_1.7.4
## [25] spatstat.sparse_3.0-0 compiler_4.2.0
## [27] httr_1.4.4 assertthat_0.2.1
## [29] fastmap_1.1.0 lazyeval_0.2.2
## [31] cli_3.6.0 later_1.3.0
## [33] htmltools_0.5.4 tools_4.2.0
## [35] igraph_1.3.5 gtable_0.3.1
## [37] glue_1.6.2 RANN_2.6.1
## [39] reshape2_1.4.4 scattermore_0.8
## [41] jquerylib_0.1.4 vctrs_0.5.2
## [43] nlme_3.1-161 spatstat.explore_3.0-5
## [45] progressr_0.13.0 lmtest_0.9-40
## [47] spatstat.random_3.1-0.008 xfun_0.36
## [49] stringr_1.5.0 globals_0.16.2
## [51] mime_0.12 miniUI_0.1.1.1
## [53] lifecycle_1.0.3 irlba_2.3.5.1
## [55] goftest_1.2-3 future_1.30.0
## [57] MASS_7.3-58.1 zoo_1.8-11
## [59] promises_1.2.0.1 spatstat.utils_3.0-1
## [61] parallel_4.2.0 RColorBrewer_1.1-3
## [63] yaml_2.3.6 reticulate_1.27
## [65] pbapply_1.7-0 gridExtra_2.3
## [67] ggrastr_1.0.1 sass_0.4.4
## [69] stringi_1.7.12 highr_0.10
## [71] rlang_1.0.6 pkgconfig_2.0.3
## [73] matrixStats_0.63.0 evaluate_0.20
## [75] lattice_0.20-45 ROCR_1.0-11
## [77] purrr_1.0.1 tensor_1.5
## [79] labeling_0.4.2 htmlwidgets_1.6.1
## [81] bit_4.0.5 cowplot_1.1.1
## [83] tidyselect_1.2.0 parallelly_1.34.0
## [85] RcppAnnoy_0.0.20 plyr_1.8.8
## [87] magrittr_2.0.3 generics_0.1.3
## [89] DBI_1.1.3 withr_2.5.0
## [91] pillar_1.8.1 fitdistrplus_1.1-8
## [93] survival_3.4-0 abind_1.4-5
## [95] sp_1.6-0 tibble_3.1.8
## [97] future.apply_1.10.0 crayon_1.5.2
## [99] KernSmooth_2.23-20 utf8_1.2.3
## [101] spatstat.geom_3.0-4.001 plotly_4.10.1
## [103] grid_4.2.0 data.table_1.14.6
## [105] digest_0.6.31 xtable_1.8-4
## [107] tidyr_1.2.1 httpuv_1.6.8
## [109] munsell_0.5.0 beeswarm_0.4.0
## [111] viridisLite_0.4.1 vipor_0.4.5
## [113] bslib_0.4.2